Outline

  1. Structure
  2. Data: dataset, preprocessing, visualisation
  3. Preliminary examination: paired correlation and spatial correlation, paired R-squared, scatterplots
  4. Machine learning methods: model parameter tuning, interpretation

This tutorial walks from data exploration through the modelling process for the global air pollution modelling project. The statistical learning methods used include the Lasso, random forest, stochastic gradient boosting, and extreme gradient boosting. Partial dependence plots and variable importance are visualised to partly interpret the models.

Required packages

ipak <- function(pkg){
  # install any packages that are not yet installed, then load them all
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
#repos='http://cran.muenster.r-project.org'

stdata = c("sp", "sf", "raster")
Stat_methods = c("glmnet", "ranger", "gbm", "xgboost", "party", "caret", "gstat")
visual = c("RColorBrewer", "ggplot2", "corrplot", "tmap", "leaflet", "mapview","leafem", "pdp", "vip", "DT", "sparkline")
map = c("maptools")
tidy = c("devtools", "dplyr",  "tidyr", "reshape2", "knitr")
other = c("countrycode", "htmlwidgets", "data.table", "Matrix", "GGally")
 

packages <- c(stdata, tidy, Stat_methods, visual, map, other)
ipak(packages)
##           sp           sf       raster     devtools        dplyr 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
##        tidyr     reshape2        knitr       glmnet       ranger 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
##          gbm      xgboost        party        caret        gstat 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
## RColorBrewer      ggplot2     corrplot         tmap      leaflet 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
##      mapview       leafem          pdp          vip           DT 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
##    sparkline     maptools  countrycode  htmlwidgets   data.table 
##         TRUE         TRUE         TRUE         TRUE         TRUE 
##       Matrix       GGally 
##         TRUE         TRUE

Auxiliary package with preprocessed data in a data frame.

install_github("mengluchu/globalLUR/globalLUR/globalLUR")
library(globalLUR)
ls("package:globalLUR")
##  [1] "Brt_imp"          "Brt_LUR"          "cforest_LUR"     
##  [4] "countrywithppm"   "create_ring"      "ctree_LUR"       
##  [7] "error_matrix"     "join_by_id"       "Lasso"           
## [10] "Lassoselected"    "mechanical"       "merge_roads"     
## [13] "merged"           "mergeraster2file" "plot_error"      
## [16] "plot_rsq"         "ppm2ug"           "RDring_coef"     
## [19] "removedips"       "rf_imp"           "rf_LUR"          
## [22] "sampledf"         "scatterplot"      "subset_grep"     
## [25] "univar_rsq"       "xgboost_imp"      "xgboost_LUR"

If the above is not successful for macOS users, the following command and a restart of R may be needed:

system('defaults write org.R-project.R force.LANG en_US.UTF-8')

Preparation

Get three color palettes.

colorB = brewer.pal(7, "Greens")
display.brewer.pal(7,"Greens")

colorG = brewer.pal(11, "PiYG")
colorS = brewer.pal(11, "Spectral")
#set.seed(2)

Dataset:

  1. value_mean: annual mean \(NO_2\) (\(\mu g/m^3\)).
  2. day_value: annual mean \(NO_2\) (\(\mu g/m^3\)) at daytime.
  3. night_value: annual mean \(NO_2\) (\(\mu g/m^3\)) at night time.

  4. road_XX_size: road length of type XX within a buffer of radius “size”. ROAD_1: highway, ROAD_2: primary, ROAD_3: secondary, ROAD_4: tertiary, ROAD_5: unpaved.
  5. I_size: industrial area within a buffer of radius “size”.
  6. Tropomi_2018: TROPOMI averaged over Feb 2018 - Jan 2019.
  7. temperature_2m_m: monthly mean temperature at 2 m height of month “m”.
  8. wind_speed_10m_m: monthly mean wind speed at 10 m height of month “m”.
  9. pop1k/pop3k/pop5k: population at 1, 3, and 5 km resolution.
  10. Rsp: Surface remote sensing and chemical transport model product.
  11. OMI_mean_filt: OMI column density, 2017 annual average.

# add data usethis::use_data()
data("merged")
data("countrywithppm") # countries with ppm (parts per million)
names(merged)
##  [1] "LATITUDE"          "LONGITUDE"         "value_mean"       
##  [4] "OMI_mean_filt"     "elevation"         "pop1k"            
##  [7] "RSp"               "temperature_2m_1"  "temperature_2m_10"
## [10] "temperature_2m_11" "temperature_2m_12" "temperature_2m_2" 
## [13] "temperature_2m_3"  "temperature_2m_4"  "temperature_2m_5" 
## [16] "temperature_2m_6"  "temperature_2m_7"  "temperature_2m_8" 
## [19] "temperature_2m_9"  "wind_speed_10m_1"  "wind_speed_10m_10"
## [22] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_2" 
## [25] "wind_speed_10m_3"  "wind_speed_10m_4"  "wind_speed_10m_5" 
## [28] "wind_speed_10m_6"  "wind_speed_10m_7"  "wind_speed_10m_8" 
## [31] "wind_speed_10m_9"  "pop5k"             "pop3k"            
## [34] "country"           "ROAD_1_25"         "ROAD_1_50"        
## [37] "ROAD_1_100"        "ROAD_1_300"        "ROAD_1_500"       
## [40] "ROAD_1_800"        "ROAD_1_1000"       "ROAD_1_3000"      
## [43] "ROAD_1_5000"       "ROAD_2_25"         "ROAD_2_50"        
## [46] "ROAD_2_100"        "ROAD_2_300"        "ROAD_2_500"       
## [49] "ROAD_2_800"        "ROAD_2_1000"       "ROAD_2_3000"      
## [52] "ROAD_2_5000"       "ROAD_3_25"         "ROAD_3_50"        
## [55] "ROAD_3_100"        "ROAD_3_300"        "ROAD_3_500"       
## [58] "ROAD_3_800"        "ROAD_3_1000"       "ROAD_3_3000"      
## [61] "ROAD_3_5000"       "ROAD_4_25"         "ROAD_4_50"        
## [64] "ROAD_4_100"        "ROAD_4_300"        "ROAD_4_500"       
## [67] "ROAD_4_800"        "ROAD_4_1000"       "ROAD_4_3000"      
## [70] "ROAD_4_5000"       "ROAD_5_25"         "ROAD_5_50"        
## [73] "ROAD_5_100"        "ROAD_5_300"        "ROAD_5_500"       
## [76] "ROAD_5_800"        "ROAD_5_1000"       "ROAD_5_3000"      
## [79] "ROAD_5_5000"       "I_1_25"            "I_1_50"           
## [82] "I_1_100"           "I_1_300"           "I_1_500"          
## [85] "I_1_800"           "I_1_1000"          "I_1_3000"         
## [88] "I_1_5000"          "Tropomi_2018"      "day_value"        
## [91] "night_value"       "ID"
merged %>% dplyr::select(value_mean, day_value, night_value) %>% summary()
##    value_mean       day_value       night_value      
##  Min.   : 1.559   Min.   : 1.883   Min.   :  0.4307  
##  1st Qu.:17.262   1st Qu.:18.353   1st Qu.: 15.4550  
##  Median :26.164   Median :28.433   Median : 22.3804  
##  Mean   :27.905   Mean   :30.912   Mean   : 23.5377  
##  3rd Qu.:36.883   3rd Qu.:41.900   3rd Qu.: 29.6922  
##  Max.   :93.885   Max.   :94.266   Max.   :100.5855  
##                   NA's   :5        NA's   :1
#datatable(merged, rownames = FALSE, filter = "top", options = list(pageLength = 5, scrollX = T))

Missing data are coded as -1; replace these with NA.

merged_1 = na_if(merged, -1)

Merge roads of different road types (here 3, 4, and 5): the road lengths of these types are aggregated per buffer. The original road-type columns are substituted (with keep = T, they are retained).

merged_mr = merge_roads(merged_1, c(3, 4, 5), keep = F) # keep = T keeps the original roads. 
names(merged_mr)
##  [1] "ROAD_M345_25"      "ROAD_M345_50"      "ROAD_M345_100"    
##  [4] "ROAD_M345_300"     "ROAD_M345_500"     "ROAD_M345_800"    
##  [7] "ROAD_M345_1000"    "ROAD_M345_3000"    "ROAD_M345_5000"   
## [10] "LATITUDE"          "LONGITUDE"         "value_mean"       
## [13] "OMI_mean_filt"     "elevation"         "pop1k"            
## [16] "RSp"               "temperature_2m_1"  "temperature_2m_10"
## [19] "temperature_2m_11" "temperature_2m_12" "temperature_2m_2" 
## [22] "temperature_2m_3"  "temperature_2m_4"  "temperature_2m_5" 
## [25] "temperature_2m_6"  "temperature_2m_7"  "temperature_2m_8" 
## [28] "temperature_2m_9"  "wind_speed_10m_1"  "wind_speed_10m_10"
## [31] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_2" 
## [34] "wind_speed_10m_3"  "wind_speed_10m_4"  "wind_speed_10m_5" 
## [37] "wind_speed_10m_6"  "wind_speed_10m_7"  "wind_speed_10m_8" 
## [40] "wind_speed_10m_9"  "pop5k"             "pop3k"            
## [43] "country"           "ROAD_1_25"         "ROAD_1_50"        
## [46] "ROAD_1_100"        "ROAD_1_300"        "ROAD_1_500"       
## [49] "ROAD_1_800"        "ROAD_1_1000"       "ROAD_1_3000"      
## [52] "ROAD_1_5000"       "ROAD_2_25"         "ROAD_2_50"        
## [55] "ROAD_2_100"        "ROAD_2_300"        "ROAD_2_500"       
## [58] "ROAD_2_800"        "ROAD_2_1000"       "ROAD_2_3000"      
## [61] "ROAD_2_5000"       "I_1_25"            "I_1_50"           
## [64] "I_1_100"           "I_1_300"           "I_1_500"          
## [67] "I_1_800"           "I_1_1000"          "I_1_3000"         
## [70] "I_1_5000"          "Tropomi_2018"      "day_value"        
## [73] "night_value"       "ID"
#numeric country
#inde_var$country=as.numeric(inde_var$country)

Visualization

Visualize with tmap (convenient).

locations_sf = st_as_sf(merged_mr, coords = c("LONGITUDE","LATITUDE"))
osm_valuemean = tm_shape(locations_sf) +
  tm_dots( "value_mean", col = "value_mean", size = 0.05,
     popup.vars = c("value_mean", "day_value", "night_value", "ROAD_2_100", "ROAD_2_5000")) + tm_view(basemaps = c('OpenStreetMap'))
#+tm_shape(lnd)+tm_lines()
tmap_save(osm_valuemean, "NO2mean.html")

Visualize with leaflet (more control). Show the day/night ratio; red: day/night > 1, blue: day/night < 1.

merged_fp = merged_mr %>%
  mutate(ratiodn = day_value/night_value) %>%
  mutate(color = ifelse(ratiodn > 1, "red", "blue"))
m = leaflet(merged_fp) %>%
  addTiles() %>%
  addCircleMarkers(radius = ~value_mean/5, color = ~color, popup = ~as.character(value_mean), fill = FALSE) %>%
  addProviderTiles(providers$Esri.NatGeoWorldMap) %>%
  addMouseCoordinates() %>%
  addHomeButton(ext = extent(116.2, 117, 39.7, 40), layer.name = "Beijing") %>%
  addHomeButton(ext = extent(5, 5.2, 52, 52.2), layer.name = "Utrecht")
saveWidget(m, file = "NO2daynight.html")

Boxplot

countryname = paste(merged_mr$country, countrycode(merged_mr$country, 'iso2c', 'country.name'), sep = ":") 

# tag countries that report in ppm (parts per million) with an asterisk
countryname_s_e = ifelse(merged_mr$country %in% countrywithppm[countrywithppm %in% merged_mr$country], paste(countryname, "*", sep = ""), countryname)
merged_mr$countryfullname = countryname_s_e

# use the median for colour
mergedmedian = merged_mr %>% group_by(country) %>% mutate(median =  median(value_mean, na.rm = TRUE))

bp2 <- ggplot(mergedmedian, aes(x=countryfullname, y=value_mean, group=country)) +
  labs(x = "Country", y = expression(paste("NO"[2], "  ", mu, "g/", "m"^3)), cex = 1.5) +
  geom_boxplot(aes(fill = median)) + 
  theme(text = element_text(size = 13), axis.text.x = element_text(angle = 90, hjust = 1)) +   scale_fill_distiller(palette = "Spectral")
#   scale_color_brewer(direction = 1)
print(bp2 + ylim(0, 100))

Plot the paired correlations between the responses, the road predictors, population, and TROPOMI, e.g. for Germany (DE) and for the full dataset.

merged_mr %>% na.omit %>% filter(country == "DE") %>% dplyr::select(matches("_value|ROAD|pop|Trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)

merged_mr %>% na.omit %>% dplyr::select(matches("_value|ROAD|pop|Trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)

Spatial dependency

grd_sp <- as_Spatial(locations_sf)
 
dt.vgm = variogram(value_mean~1, grd_sp)
plot(dt.vgm)

# Moran's I test
# install.packages("ape", dependencies = TRUE)
# library(ape)

# merged_mrf = merged_mr %>% filter(country == "US")
# no2.dists <- as.matrix(dist(cbind(merged_mrf$LONGITUDE, merged_mrf$LATITUDE)))
# no2.dists[1:5, 1:5]
# no2.inv <- 1/no2.dists            # inverse-distance weights
# diag(no2.inv) <- 0
# no2.inv[1:5, 1:5]
# Moran.I(merged_mrf$value_mean, no2.inv, na.rm = TRUE)  # the weight matrix is the second argument
countryvariogram = function(COUN, cutoff){
  # empirical variogram of value_mean for one country
  loca = locations_sf %>% filter(country == COUN)
  grd_sp <- as_Spatial(loca)
  dt.vgm = variogram(value_mean~1, grd_sp, cutoff = cutoff)
  plot(dt.vgm)
}
countryvariogram("DE", 1)

countryvariogram("US", 1)

countryvariogram("CN", 1) # reason?

Data preprocessing:

  1. add variables by ID or by rasters (not in this document);
  2. remove unwanted columns or records;
  3. select records (e.g. by country), separate testing and training sets.

Separate the dataset into training and test sets by a fraction (here 80% of the records are used for training, the rest for testing); “DE” is the two-digit country code for Germany. For the world model, the sampling applies the fraction per country.

#merged = merge(merged, stat[,-which(names(stat)%in%c("LATITUDE", "LONGITUDE"))], by = "ID", all.x = T)

National model: Germany as an example

response_predictor = globalLUR::sampledf(merged_mr, fraction = 0.8, country2digit = "DE", grepstring_rm = "ID|LATITUDE|LONGITUDE|countryfullname")

#Retrieve test, training, and all variables.  
 
test = response_predictor$test
training = response_predictor$training
inde_var = response_predictor$inde_var
inde_var = inde_var %>% dplyr::select(-country)

The sizes of the test and training sets:

length(test)
## [1] 67
length(training)
## [1] 267

The paired correlation between dependent (mean, day, night) and independent variables. How much information does R-squared tell you?

# Check the univariate R-squared: calculate the R-squared for day, night, and mean, and bind the columns to form a dataframe for plotting.
rsqmean =  inde_var %>% dplyr::select(-matches("value_mean|day_|night_")) %>%  univar_rsq (inde_var$value_mean)

rsqday = inde_var %>% dplyr::select(-matches("value_mean|day_|night_")) %>%  univar_rsq (inde_var$day_value) 
rsqnight = inde_var %>% dplyr::select(-matches("value_mean|day_|night_")) %>%  univar_rsq (inde_var$night_value) 

rsqdf = cbind(rsqmean, rsqday, rsqnight, rownames(rsqmean))  
names(rsqdf)= c("mean","day","night","vars")

plot_rsq(rsqdf = rsqdf, varname = "vars",xlab = "predictors", ylab = "R-squared")

# How does it compare to the variable importance estimated from LASSO, RF, SGB, XGB, etc.?

The scatter plots between predictors and responses. loess (locally weighted regression) is a non-parametric moving regression: each smoothed value is given by a weighted linear least-squares regression over the span.
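
A minimal base-R sketch of the span parameter (illustrative only; the choice of pop3k as predictor is arbitrary, and the tutorial's own scatterplot wrapper is used below):

d = na.omit(inde_var[, c("pop3k", "day_value")])
ord = order(d$pop3k)
fit_wide = loess(day_value ~ pop3k, data = d, span = 0.75)    # wide span: smoother curve
fit_narrow = loess(day_value ~ pop3k, data = d, span = 0.25)  # narrow span: follows the data more closely
plot(d$pop3k, d$day_value, pch = 16, cex = 0.5)
lines(d$pop3k[ord], predict(fit_wide)[ord], col = "red")
lines(d$pop3k[ord], predict(fit_narrow)[ord], col = "blue")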

inde_var %>% dplyr::select(matches("ROAD_M345_3000|pop3k|ROAD_2_50$|temperature_2m_7|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")

inde_var %>% dplyr::select(matches("Tro|OMI|Rsp|night_value")) %>% scatterplot(y_name = "night_value", fittingmethod = "loess")

# why?

# can choose any variable to look at the scatterplot

#inde_var %>% dplyr::select(matches("ROAD_1|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")

Discussion 1: try different scatterplots and discuss the findings.

inde_var %>% names
##  [1] "ROAD_M345_25"      "ROAD_M345_50"      "ROAD_M345_100"    
##  [4] "ROAD_M345_300"     "ROAD_M345_500"     "ROAD_M345_800"    
##  [7] "ROAD_M345_1000"    "ROAD_M345_3000"    "ROAD_M345_5000"   
## [10] "value_mean"        "OMI_mean_filt"     "elevation"        
## [13] "pop1k"             "RSp"               "temperature_2m_1" 
## [16] "temperature_2m_10" "temperature_2m_11" "temperature_2m_12"
## [19] "temperature_2m_2"  "temperature_2m_3"  "temperature_2m_4" 
## [22] "temperature_2m_5"  "temperature_2m_6"  "temperature_2m_7" 
## [25] "temperature_2m_8"  "temperature_2m_9"  "wind_speed_10m_1" 
## [28] "wind_speed_10m_10" "wind_speed_10m_11" "wind_speed_10m_12"
## [31] "wind_speed_10m_2"  "wind_speed_10m_3"  "wind_speed_10m_4" 
## [34] "wind_speed_10m_5"  "wind_speed_10m_6"  "wind_speed_10m_7" 
## [37] "wind_speed_10m_8"  "wind_speed_10m_9"  "pop5k"            
## [40] "pop3k"             "ROAD_1_25"         "ROAD_1_50"        
## [43] "ROAD_1_100"        "ROAD_1_300"        "ROAD_1_500"       
## [46] "ROAD_1_800"        "ROAD_1_1000"       "ROAD_1_3000"      
## [49] "ROAD_1_5000"       "ROAD_2_25"         "ROAD_2_50"        
## [52] "ROAD_2_100"        "ROAD_2_300"        "ROAD_2_500"       
## [55] "ROAD_2_800"        "ROAD_2_1000"       "ROAD_2_3000"      
## [58] "ROAD_2_5000"       "I_1_25"            "I_1_50"           
## [61] "I_1_100"           "I_1_300"           "I_1_500"          
## [64] "I_1_800"           "I_1_1000"          "I_1_3000"         
## [67] "I_1_5000"          "Tropomi_2018"      "day_value"        
## [70] "night_value"
inde_var %>% dplyr::select(matches("ROAD_1_500$|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")

Modelling

  1. Tree-based
  2. Lasso
  3. Mechanical model (nls)

Extra: 5) separate urban/rural hierarchical (two-step) linear regression, 6) mixed-effects regression

LM: linear regression model

As a baseline, fit a linear regression for the mean, day, and night values. Predictors are population, temperature, wind speed, the GEOM product, the OMI tropospheric column, elevation, and road buffers,

i.e. ROAD|population|value_mean|temperature|wind|GEOM product|OMI|elevation.

Note that population is not always significant, even though the univariate R-squared of each buffer is high. The prediction for night is much better than for the day.

inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")
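
A minimal baseline sketch (illustrative; lm_fit and lm_pred are not part of globalLUR):

lm_fit = lm(value_mean ~ ., data = inde_var_train)
summary(lm_fit)  # check which predictors are significant
lm_pred = predict(lm_fit, inde_var[test, ])  # rows with NA predictors yield NA predictions
error_matrix(prediction = lm_pred, validation = inde_var[test, ]$value_mean)
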
Regression tree

The tree and the prediction error will differ when the training and testing data are reshuffled.

for (i in 2:5)
{
  set.seed(i)
  testtree = globalLUR::sampledf(merged_mr,fraction = 0.8, "DE" )
  with (testtree, ctree_LUR(inde_var, y_varname= c("day_value"), training = training, test =  test, grepstring ="ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi" ))
}
Random forest

Random forests create a diverse set of trees because 1) trees are unstable w.r.t. changes in the learning/training data (exploited by bagging), and 2) mtry candidate splitting variables are randomly preselected at each split.
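
A minimal sketch of the mtry effect (assumptions: ranger is loaded and inde_var_train comes from the chunk above; the mtry values are illustrative):

for (m in c(2, 8, 33)) {
  fit = ranger(value_mean ~ ., data = na.omit(inde_var_train), mtry = m, num.trees = 500)
  cat("mtry =", m, " OOB R-squared =", fit$r.squared, "\n")
}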

Model training and parameter tuning

#caret
names(getModelInfo())
##   [1] "ada"                 "AdaBag"              "AdaBoost.M1"        
##   [4] "adaboost"            "amdai"               "ANFIS"              
##   [7] "avNNet"              "awnb"                "awtan"              
##  [10] "bag"                 "bagEarth"            "bagEarthGCV"        
##  [13] "bagFDA"              "bagFDAGCV"           "bam"                
##  [16] "bartMachine"         "bayesglm"            "binda"              
##  [19] "blackboost"          "blasso"              "blassoAveraged"     
##  [22] "bridge"              "brnn"                "BstLm"              
##  [25] "bstSm"               "bstTree"             "C5.0"               
##  [28] "C5.0Cost"            "C5.0Rules"           "C5.0Tree"           
##  [31] "cforest"             "chaid"               "CSimca"             
##  [34] "ctree"               "ctree2"              "cubist"             
##  [37] "dda"                 "deepboost"           "DENFIS"             
##  [40] "dnn"                 "dwdLinear"           "dwdPoly"            
##  [43] "dwdRadial"           "earth"               "elm"                
##  [46] "enet"                "evtree"              "extraTrees"         
##  [49] "fda"                 "FH.GBML"             "FIR.DM"             
##  [52] "foba"                "FRBCS.CHI"           "FRBCS.W"            
##  [55] "FS.HGD"              "gam"                 "gamboost"           
##  [58] "gamLoess"            "gamSpline"           "gaussprLinear"      
##  [61] "gaussprPoly"         "gaussprRadial"       "gbm_h2o"            
##  [64] "gbm"                 "gcvEarth"            "GFS.FR.MOGUL"       
##  [67] "GFS.LT.RS"           "GFS.THRIFT"          "glm.nb"             
##  [70] "glm"                 "glmboost"            "glmnet_h2o"         
##  [73] "glmnet"              "glmStepAIC"          "gpls"               
##  [76] "hda"                 "hdda"                "hdrda"              
##  [79] "HYFIS"               "icr"                 "J48"                
##  [82] "JRip"                "kernelpls"           "kknn"               
##  [85] "knn"                 "krlsPoly"            "krlsRadial"         
##  [88] "lars"                "lars2"               "lasso"              
##  [91] "lda"                 "lda2"                "leapBackward"       
##  [94] "leapForward"         "leapSeq"             "Linda"              
##  [97] "lm"                  "lmStepAIC"           "LMT"                
## [100] "loclda"              "logicBag"            "LogitBoost"         
## [103] "logreg"              "lssvmLinear"         "lssvmPoly"          
## [106] "lssvmRadial"         "lvq"                 "M5"                 
## [109] "M5Rules"             "manb"                "mda"                
## [112] "Mlda"                "mlp"                 "mlpKerasDecay"      
## [115] "mlpKerasDecayCost"   "mlpKerasDropout"     "mlpKerasDropoutCost"
## [118] "mlpML"               "mlpSGD"              "mlpWeightDecay"     
## [121] "mlpWeightDecayML"    "monmlp"              "msaenet"            
## [124] "multinom"            "mxnet"               "mxnetAdam"          
## [127] "naive_bayes"         "nb"                  "nbDiscrete"         
## [130] "nbSearch"            "neuralnet"           "nnet"               
## [133] "nnls"                "nodeHarvest"         "null"               
## [136] "OneR"                "ordinalNet"          "ordinalRF"          
## [139] "ORFlog"              "ORFpls"              "ORFridge"           
## [142] "ORFsvm"              "ownn"                "pam"                
## [145] "parRF"               "PART"                "partDSA"            
## [148] "pcaNNet"             "pcr"                 "pda"                
## [151] "pda2"                "penalized"           "PenalizedLDA"       
## [154] "plr"                 "pls"                 "plsRglm"            
## [157] "polr"                "ppr"                 "PRIM"               
## [160] "protoclass"          "qda"                 "QdaCov"             
## [163] "qrf"                 "qrnn"                "randomGLM"          
## [166] "ranger"              "rbf"                 "rbfDDA"             
## [169] "Rborist"             "rda"                 "regLogistic"        
## [172] "relaxo"              "rf"                  "rFerns"             
## [175] "RFlda"               "rfRules"             "ridge"              
## [178] "rlda"                "rlm"                 "rmda"               
## [181] "rocc"                "rotationForest"      "rotationForestCp"   
## [184] "rpart"               "rpart1SE"            "rpart2"             
## [187] "rpartCost"           "rpartScore"          "rqlasso"            
## [190] "rqnc"                "RRF"                 "RRFglobal"          
## [193] "rrlda"               "RSimca"              "rvmLinear"          
## [196] "rvmPoly"             "rvmRadial"           "SBC"                
## [199] "sda"                 "sdwd"                "simpls"             
## [202] "SLAVE"               "slda"                "smda"               
## [205] "snn"                 "sparseLDA"           "spikeslab"          
## [208] "spls"                "stepLDA"             "stepQDA"            
## [211] "superpc"             "svmBoundrangeString" "svmExpoString"      
## [214] "svmLinear"           "svmLinear2"          "svmLinear3"         
## [217] "svmLinearWeights"    "svmLinearWeights2"   "svmPoly"            
## [220] "svmRadial"           "svmRadialCost"       "svmRadialSigma"     
## [223] "svmRadialWeights"    "svmSpectrumString"   "tan"                
## [226] "tanSearch"           "treebag"             "vbmpRadial"         
## [229] "vglmAdjCat"          "vglmContRatio"       "vglmCumulative"     
## [232] "widekernelpls"       "WM"                  "wsrf"               
## [235] "xgbDART"             "xgbLinear"           "xgbTree"            
## [238] "xyf"
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")

Training RF using caret

Tune mtry (the default tuning parameter for method = "rf"):

model_rf = train(value_mean ~ ., data = inde_var_train, method='rf') # mtry
plot(model_rf)

fitted <- predict(model_rf, inde_var[test, ])
error_matrix(prediction = fitted, validation = inde_var[test, ]$value_mean)

Customize a tuning grid and control the resampling method:

trl <- trainControl(method = "cv", number = 3) # control the resampling method
 tgrid <- expand.grid(
  .mtry = 2:4,
  .splitrule = "variance",
  .min.node.size = c(10, 20)
)
model_rf = train(value_mean ~ ., data = inde_var_train, method='ranger', trControl  = trl, tuneGrid= tgrid)
model_rf
## Random Forest 
## 
## 267 samples
##  66 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 178, 177, 179 
## Resampling results across tuning parameters:
## 
##   mtry  min.node.size  RMSE      Rsquared   MAE     
##   2     10             8.490272  0.6036633  6.492191
##   2     20             8.650491  0.6012141  6.676920
##   3     10             8.389327  0.6013799  6.319596
##   3     20             8.481678  0.6008361  6.417508
##   4     10             8.332224  0.5983231  6.262277
##   4     20             8.358657  0.6047600  6.293159
## 
## Tuning parameter 'splitrule' was held constant at a value of variance
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 4, splitrule =
##  variance and min.node.size = 10.

Try after the workshop, as it takes quite a while:

model_gbm = train(value_mean ~ ., data = inde_var[training, ], method='gbm')
plot(model_gbm)
#gbm.step optimal number of trees. 

Also computationally intensive:

#install.packages("caretEnsemble")
library(caretEnsemble)

# Stacking Algorithms - Run multiple algos in one call.
trainControl <- trainControl(method = "repeatedcv",
                             number = 10,
                             repeats = 2,
                             savePredictions = TRUE) # classProbs applies to classification only, so it is dropped here

# 'adaboost' is classification-only in caret, so it is left out of this regression task
algorithmList <- c('rf', 'earth', 'xgbDART', 'svmRadial')

set.seed(100)
models <- caretList(value_mean ~ ., data = inde_var_train, trControl = trainControl, methodList = algorithmList) 
results <- resamples(models)
summary(results)

Important variables and partial dependence plots: using the “vip” package

set.seed(2)
vip::list_metrics()
##     Metric                       Description                      Task
## 1      auc            Area under (ROC) curve     Binary classification
## 2    error           Misclassification error     Binary classification
## 3  logloss                          Log loss     Binary classification
## 4     mauc Multiclass area under (ROC) curve Multiclass classification
## 5 mlogloss               Multiclass log loss Multiclass classification
## 6      mse                Mean squared error                Regression
## 7       r2                         R squared                Regression
## 8 rsquared                         R squared                Regression
## 9     rmse           Root mean squared error                Regression
pre_mat = subset_grep(inde_var_train, grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
rf = ranger(value_mean~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
rf
## Ranger result
## 
## Call:
##  ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000,      importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  2000 
## Sample size:                      267 
## Number of independent variables:  65 
## Mtry:                             33 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       63.26486 
## R squared (OOB):                  0.6118315
# ranger method
importance(rf)
##      ROAD_M345_25      ROAD_M345_50     ROAD_M345_100     ROAD_M345_300 
##       2.071941955       0.786709144       2.757685453       6.931659591 
##     ROAD_M345_500     ROAD_M345_800    ROAD_M345_1000    ROAD_M345_3000 
##       1.080055444       0.416176479       1.654518898      39.570696705 
##    ROAD_M345_5000         elevation             pop1k  temperature_2m_1 
##       6.112117707       0.514998489       4.319145371       0.086869503 
## temperature_2m_10 temperature_2m_11 temperature_2m_12  temperature_2m_2 
##       0.128270849       0.117776079       0.060201905       0.255878002 
##  temperature_2m_3  temperature_2m_4  temperature_2m_5  temperature_2m_6 
##       0.065205163      -0.065383193      -0.003365206       0.071518211 
##  temperature_2m_7  temperature_2m_8  temperature_2m_9  wind_speed_10m_1 
##      -0.113065457      -0.180493808       0.059707567      -0.164441990 
## wind_speed_10m_10 wind_speed_10m_11 wind_speed_10m_12  wind_speed_10m_2 
##       0.003114987       0.384357765      -0.055134821      -0.023782619 
##  wind_speed_10m_3  wind_speed_10m_4  wind_speed_10m_5  wind_speed_10m_6 
##       1.706289993       0.149691920       0.032650884       0.067292309 
##  wind_speed_10m_7  wind_speed_10m_8  wind_speed_10m_9             pop5k 
##       1.038069185       0.183487893       0.054924121       2.764777646 
##             pop3k         ROAD_1_25         ROAD_1_50        ROAD_1_100 
##      18.187115129       0.092611059       0.202740416       0.248077246 
##        ROAD_1_300        ROAD_1_500        ROAD_1_800       ROAD_1_1000 
##       0.010326924       0.210433910       0.348406224       0.149252190 
##       ROAD_1_3000       ROAD_1_5000         ROAD_2_25         ROAD_2_50 
##       2.199535035       4.343129614       9.561830513      20.983413356 
##        ROAD_2_100        ROAD_2_300        ROAD_2_500        ROAD_2_800 
##       5.741654700       1.128635693       0.424655474       0.061223087 
##       ROAD_2_1000       ROAD_2_3000       ROAD_2_5000            I_1_25 
##      -0.220691413       1.071412913       0.168767210      -0.010741075 
##            I_1_50           I_1_100           I_1_300           I_1_500 
##       0.004740863       0.008445466      -0.023096609       0.107917128 
##           I_1_800          I_1_1000          I_1_3000          I_1_5000 
##      -0.062113316       0.256930650       0.564113069       1.734231839 
##      Tropomi_2018 
##       2.108224380
#vip
DF_P_r2 = vi(rf, method = "permute", target = "value_mean", metric = "r2" ) # very clear what method is used and what is the target
DF_P_rmse = vi(rf, method = "permute", target = "value_mean", metric = "rmse") 


vip(DF_P_rmse)

vip(DF_P_r2)

# correlated predictors share permutation importance; check correlations between top-ranked variables
with(inde_var, cor(ROAD_M345_3000, ROAD_2_50))
## [1] 0.3211617
with(inde_var, cor(ROAD_M345_3000, pop3k))
## [1] 0.8348217

Partial dependence plots of all the variables (using sparklines; takes a while).

a = add_sparklines(DF_P_r2, fit = rf)
saveWidget(a, file="sparkline.html")

Partial dependence plots of selected variables

pre_mat_s = inde_var_train %>% dplyr::select(value_mean, ROAD_2_50, pop3k, ROAD_M345_300)
lm_s = lm(value_mean~., data = pre_mat_s)
rf_s = ranger(value_mean~ ., data = pre_mat_s, num.trees = 2000, importance = "permutation")
rf_s
## Ranger result
## 
## Call:
##  ranger(value_mean ~ ., data = pre_mat_s, num.trees = 2000, importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  2000 
## Sample size:                      267 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       71.06144 
## R squared (OOB):                  0.5639948

Correlation between the selected predictors:

pre_mat_predictor = pre_mat_s %>% dplyr::select(-value_mean)
ggpairs(pre_mat_predictor)

p_lm = partial(lm_s, "ROAD_M345_300",plot = TRUE, rug = TRUE)
plot(p_lm)

p2 = partial(rf_s, "ROAD_M345_300",plot = TRUE, rug = TRUE)
plot(p2)

#slow
pd <- partial(rf_s, pred.var = c("pop3k", "ROAD_M345_300"  ))

# Default PDP
pd1 = plotPartial(pd)

# Add contour lines and use a different color palette
rwb <- colorRampPalette(c("red", "white", "blue"))
pdp2 = plotPartial(pd, contour = TRUE, col.regions = rwb)
 
#3-D surface
#pdp3 <- plotPartial(pd, levelplot = F, zlab = "ROAD_1_50", colorkey = T, 
#                   screen = list(z = -20, x = -60) )

p3 = partial(rf_s, "ROAD_2_50", plot = TRUE, rug = TRUE)
p1 = partial(rf_s, "pop3k", plot = TRUE, rug = TRUE)
library(gridExtra) # grid.arrange comes from gridExtra
grid.arrange(p1, p2, p3, pd1, pdp2, ncol = 2)

Gradient boosting

pre_mat = subset_grep(inde_var_train, grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")

gbm1 =  gbm(formula = value_mean~., data = pre_mat, distribution = "gaussian", n.trees = 2000,
  interaction.depth = 6, shrinkage = 0.01, bag.fraction = 0.5 )
names(pre_mat)
##  [1] "ROAD_M345_25"      "ROAD_M345_50"      "ROAD_M345_100"    
##  [4] "ROAD_M345_300"     "ROAD_M345_500"     "ROAD_M345_800"    
##  [7] "ROAD_M345_1000"    "ROAD_M345_3000"    "ROAD_M345_5000"   
## [10] "value_mean"        "elevation"         "pop1k"            
## [13] "temperature_2m_1"  "temperature_2m_10" "temperature_2m_11"
## [16] "temperature_2m_12" "temperature_2m_2"  "temperature_2m_3" 
## [19] "temperature_2m_4"  "temperature_2m_5"  "temperature_2m_6" 
## [22] "temperature_2m_7"  "temperature_2m_8"  "temperature_2m_9" 
## [25] "wind_speed_10m_1"  "wind_speed_10m_10" "wind_speed_10m_11"
## [28] "wind_speed_10m_12" "wind_speed_10m_2"  "wind_speed_10m_3" 
## [31] "wind_speed_10m_4"  "wind_speed_10m_5"  "wind_speed_10m_6" 
## [34] "wind_speed_10m_7"  "wind_speed_10m_8"  "wind_speed_10m_9" 
## [37] "pop5k"             "pop3k"             "ROAD_1_25"        
## [40] "ROAD_1_50"         "ROAD_1_100"        "ROAD_1_300"       
## [43] "ROAD_1_500"        "ROAD_1_800"        "ROAD_1_1000"      
## [46] "ROAD_1_3000"       "ROAD_1_5000"       "ROAD_2_25"        
## [49] "ROAD_2_50"         "ROAD_2_100"        "ROAD_2_300"       
## [52] "ROAD_2_500"        "ROAD_2_800"        "ROAD_2_1000"      
## [55] "ROAD_2_3000"       "ROAD_2_5000"       "I_1_25"           
## [58] "I_1_50"            "I_1_100"           "I_1_300"          
## [61] "I_1_500"           "I_1_800"           "I_1_1000"         
## [64] "I_1_3000"          "I_1_5000"          "Tropomi_2018"
summary(gbm1)

##                                 var      rel.inf
## ROAD_M345_3000       ROAD_M345_3000 18.729273843
## pop3k                         pop3k  5.677425187
## ROAD_2_50                 ROAD_2_50  5.560551142
## ROAD_1_5000             ROAD_1_5000  5.419704602
## pop1k                         pop1k  4.765462300
## ROAD_M345_300         ROAD_M345_300  4.591490062
## ROAD_M345_1000       ROAD_M345_1000  3.261040695
## pop5k                         pop5k  3.002087174
## ROAD_M345_5000       ROAD_M345_5000  2.868400699
## ROAD_M345_100         ROAD_M345_100  2.721436800
## ROAD_1_3000             ROAD_1_3000  2.578866762
## ROAD_2_25                 ROAD_2_25  2.516762791
## ROAD_M345_800         ROAD_M345_800  2.261027153
## Tropomi_2018           Tropomi_2018  2.258823583
## ROAD_2_3000             ROAD_2_3000  2.079711472
## ROAD_M345_500         ROAD_M345_500  2.071979514
## ROAD_M345_25           ROAD_M345_25  1.921521160
## I_1_1000                   I_1_1000  1.660720158
## I_1_5000                   I_1_5000  1.642287884
## I_1_3000                   I_1_3000  1.641833592
## ROAD_2_100               ROAD_2_100  1.408648391
## ROAD_M345_50           ROAD_M345_50  1.240984174
## ROAD_2_5000             ROAD_2_5000  1.181191022
## wind_speed_10m_7   wind_speed_10m_7  1.155505284
## ROAD_2_300               ROAD_2_300  1.100270198
## wind_speed_10m_3   wind_speed_10m_3  1.063805042
## ROAD_2_500               ROAD_2_500  1.038646388
## ROAD_1_800               ROAD_1_800  0.901051114
## elevation                 elevation  0.847960485
## I_1_800                     I_1_800  0.839865882
## ROAD_2_1000             ROAD_2_1000  0.760772821
## ROAD_1_300               ROAD_1_300  0.713432648
## temperature_2m_8   temperature_2m_8  0.688021738
## temperature_2m_3   temperature_2m_3  0.629102785
## ROAD_2_800               ROAD_2_800  0.585293105
## ROAD_1_1000             ROAD_1_1000  0.554155408
## wind_speed_10m_11 wind_speed_10m_11  0.529279905
## temperature_2m_4   temperature_2m_4  0.512120369
## temperature_2m_5   temperature_2m_5  0.507692730
## wind_speed_10m_1   wind_speed_10m_1  0.490299279
## I_1_500                     I_1_500  0.435542039
## temperature_2m_9   temperature_2m_9  0.413218205
## wind_speed_10m_12 wind_speed_10m_12  0.401429820
## wind_speed_10m_4   wind_speed_10m_4  0.390885695
## wind_speed_10m_5   wind_speed_10m_5  0.359105800
## ROAD_1_100               ROAD_1_100  0.358750802
## temperature_2m_2   temperature_2m_2  0.356378886
## wind_speed_10m_9   wind_speed_10m_9  0.348008155
## wind_speed_10m_2   wind_speed_10m_2  0.345129777
## temperature_2m_7   temperature_2m_7  0.327913314
## wind_speed_10m_8   wind_speed_10m_8  0.320314955
## temperature_2m_12 temperature_2m_12  0.310940087
## ROAD_1_500               ROAD_1_500  0.297104796
## temperature_2m_6   temperature_2m_6  0.292189587
## temperature_2m_1   temperature_2m_1  0.219371740
## temperature_2m_10 temperature_2m_10  0.200804513
## wind_speed_10m_6   wind_speed_10m_6  0.194312135
## temperature_2m_11 temperature_2m_11  0.187023847
## wind_speed_10m_10 wind_speed_10m_10  0.147662978
## I_1_300                     I_1_300  0.105714225
## I_1_100                     I_1_100  0.006467531
## I_1_50                       I_1_50  0.003225773
## ROAD_1_25                 ROAD_1_25  0.000000000
## ROAD_1_50                 ROAD_1_50  0.000000000
## I_1_25                       I_1_25  0.000000000
plot(gbm1, i.var = 2:3)

#plot(gbm1, i.var = 1) 
#rf_residual <- pre_rf -  rdf_test$NO2
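
The optimal number of trees can be estimated with gbm.perf; a short sketch (method = "OOB" uses the out-of-bag estimate, which tends to be conservative):

best_n = gbm.perf(gbm1, method = "OOB")
print(best_n)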

XGBoost

Tuning XGBoost is more complex, as it has many more hyperparameters to tune: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

  1. gamma [default=0] [range: (0,Inf)]: controls regularization (i.e. helps prevent overfitting). It is the minimum loss reduction required to make a further split, so it penalizes splits that do not sufficiently improve the model; the higher the value, the stronger the regularization, and default = 0 means no regularization. The optimal value depends on the data set and the other parameter values. Tuning trick: start with 0 and check the CV error rate. If you see train error >>> test error, bring gamma into action.

  2. lambda and alpha: the L2 and L1 regularization terms on the leaf weights, respectively.

  3. nrounds [default=100]: controls the maximum number of boosting iterations, i.e. the number of trees to grow. Should be tuned using CV.

  4. eta [default=0.3] [range: (0,1)]: controls the learning rate, i.e. the rate at which the model learns patterns in the data. After every round, it shrinks the feature weights, so a lower eta leads to slower learning and must be supported by an increase in nrounds. Typically it lies between 0.01 and 0.3.

  5. max_depth [default=6] [range: (0,Inf)]: controls the depth of each tree. Larger data sets may require deeper trees to learn the rules from the data. Should be tuned using CV.

  y_varname= "day_value"
  varstring = "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
  prenres = paste(y_varname, "|", varstring, sep = "")
  sub_mat = subset_grep(inde_var, prenres)
    
  x_train = sub_mat[training, ]
  y_train = sub_mat[training, y_varname]
    
  x_test = sub_mat[test, ]
  y_test = sub_mat[test, y_varname]
 
  df_train = data.table(x_train, keep.rownames = F)
  df_test = data.table(x_test, keep.rownames = F)
  formu = as.formula(paste(y_varname, "~.", sep = ""))
  dfmatrix_train = sparse.model.matrix(formu, data = df_train)[, -1]
  dfmatrix_test = sparse.model.matrix(formu, data = df_test)[, -1]
 
  train_b = xgb.DMatrix(data = dfmatrix_train, label = y_train) 
  test_b = xgb.DMatrix(data = dfmatrix_test, label = y_test) 
  #params <- list(booster = "gbtree",max_depth = 4,
  #eta = 0.05,
  #nthread = 2,
  #nrounds = 1000,
  #Gamma = 2)
  #xgb_t = xgb.train (params = params, data = train_b, nrounds = 500, watchlist = list(val=test_b, train = train_b), print_every_n = 10, early_stopping_rounds = 50, maximize = F , eval_metric = "rmse")

  #outputvec = inde_var[training, y_varname]
  max_depth = 4
  eta = 0.01
  nthread = 4
  nrounds = 1000
  Gamma = 2 # note: XGBoost's parameter is lowercase gamma; this capitalized spelling is not interpreted as gamma
  
  # simplest: tuning the number of rounds via cross-validation
  xgbcv <- xgb.cv(data = train_b, nfold = 10, nround = nrounds, eta = eta, nthread = nthread, Gamma = Gamma, showsd = T, stratified = T, print_every_n = 200, early_stopping_rounds = 50, maximize = F)
## [1]  train-rmse:28.865029+0.208921   test-rmse:28.843940+1.851794 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [201]    train-rmse:5.709286+0.087616    test-rmse:11.002866+1.877998 
## [401]    train-rmse:1.616282+0.072362    test-rmse:9.996568+1.508015 
## Stopping. Best iteration:
## [461]    train-rmse:1.217682+0.071539    test-rmse:9.975331+1.469296
  xgbcv
## ##### xgb.cv 10-folds
##     iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
##        1      28.8650288     0.20892099      28.843940      1.851794
##        2      28.6086891     0.20705758      28.598292      1.851333
##        3      28.3548025     0.20515050      28.357452      1.850509
##        4      28.1035497     0.20331562      28.120897      1.846814
##        5      27.8548682     0.20155925      27.882485      1.847612
## ---                                                                 
##      507       1.0080563     0.06842046       9.979148      1.440676
##      508       1.0043468     0.06821342       9.979493      1.439730
##      509       1.0004766     0.06814699       9.979335      1.438870
##      510       0.9967225     0.06796196       9.979337      1.438407
##      511       0.9928339     0.06810732       9.979101      1.438040
## Best iteration:
##  iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
##   461        1.217682     0.07153884       9.975331      1.469296
  str(xgbcv)
## List of 9
##  $ call           : language xgb.cv(data = train_b, nrounds = nrounds, nfold = 10, showsd = T,      stratified = T, print_every_n = 200, early| __truncated__ ...
##  $ params         :List of 4
##   ..$ eta    : num 0.01
##   ..$ nthread: num 4
##   ..$ Gamma  : num 2
##   ..$ silent : num 1
##  $ callbacks      :List of 3
##   ..$ cb.print.evaluation:function (env = parent.frame())  
##   .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n, showsd = showsd)
##   .. ..- attr(*, "name")= chr "cb.print.evaluation"
##   ..$ cb.evaluation.log  :function (env = parent.frame(), finalize = FALSE)  
##   .. ..- attr(*, "call")= language cb.evaluation.log()
##   .. ..- attr(*, "name")= chr "cb.evaluation.log"
##   ..$ cb.early.stop      :function (env = parent.frame(), finalize = FALSE)  
##   .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,      verbose = verbose)
##   .. ..- attr(*, "name")= chr "cb.early.stop"
##  $ evaluation_log :Classes 'data.table' and 'data.frame':    511 obs. of  5 variables:
##   ..$ iter           : num [1:511] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ train_rmse_mean: num [1:511] 28.9 28.6 28.4 28.1 27.9 ...
##   ..$ train_rmse_std : num [1:511] 0.209 0.207 0.205 0.203 0.202 ...
##   ..$ test_rmse_mean : num [1:511] 28.8 28.6 28.4 28.1 27.9 ...
##   ..$ test_rmse_std  : num [1:511] 1.85 1.85 1.85 1.85 1.85 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ niter          : int 511
##  $ nfeatures      : int 67
##  $ folds          :List of 10
##   ..$ : int [1:26] 99 49 148 193 65 257 16 10 27 169 ...
##   ..$ : int [1:26] 95 259 46 137 167 183 110 207 7 155 ...
##   ..$ : int [1:26] 103 54 147 214 15 154 29 125 129 243 ...
##   ..$ : int [1:26] 108 22 237 98 198 251 165 72 12 78 ...
##   ..$ : int [1:26] 190 63 11 256 140 17 56 94 187 168 ...
##   ..$ : int [1:26] 206 132 241 133 246 64 231 77 116 156 ...
##   ..$ : int [1:26] 176 218 123 255 113 220 26 199 90 138 ...
##   ..$ : int [1:26] 192 83 224 182 75 44 180 85 55 57 ...
##   ..$ : int [1:26] 233 178 24 53 185 96 235 82 158 191 ...
##   ..$ : int [1:33] 86 238 81 170 248 157 260 144 88 102 ...
##  $ best_iteration : int 461
##  $ best_ntreelimit: num 461
##  - attr(*, "class")= chr "xgb.cv.synchronous"
  bst <- xgboost(data = train_b, max_depth = max_depth, eta = eta, nthread = nthread, nrounds = xgbcv$best_iteration, Gamma = Gamma, verbose = 1, print_every_n = 200, early_stopping_rounds = 50, maximize = F )
## [1]  train-rmse:28.866821 
## Will train until train_rmse hasn't improved in 50 rounds.
## 
## [201]    train-rmse:6.760702 
## [401]    train-rmse:3.562510 
## [461]    train-rmse:3.131862
  xgbpre = predict(bst, test_b)
  error_matrix(y_test, xgbpre)
##      RMSE       MAE       IQR 
## 11.679237  8.006234  9.234470
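
Other hyperparameters can be swept the same way; a minimal sketch (hypothetical grid and eta, reusing train_b from above):

for (g in c(0, 1, 2, 5)) {
  cv = xgb.cv(data = train_b, nfold = 5, nrounds = 500, eta = 0.05, max_depth = 4,
              gamma = g, early_stopping_rounds = 50, verbose = 0)
  cat("gamma =", g, " best test RMSE =", min(cv$evaluation_log$test_rmse_mean), "\n")
}
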
varstring = "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
xgboost_LUR(inde_var, max_depth =4, eta =0.02, nthread = 2, nrounds = 2000, y_varname= c("day_value"),training = training, test = test, grepstring = varstring )
##            Feature       Gain       Cover   Frequency
##  1: ROAD_M345_3000 0.43568940 0.017184898 0.023145552
##  2:      ROAD_2_50 0.11319873 0.011145687 0.009380576
##  3:  ROAD_M345_300 0.04512562 0.062692837 0.059240377
##  4:          pop3k 0.04126691 0.005721383 0.006270711
##  5:    ROAD_1_5000 0.04002937 0.027056442 0.027835840
##  6:   ROAD_M345_25 0.03055334 0.025871423 0.090033138
##  7:          pop1k 0.02869213 0.007340705 0.011419832
##  8:    ROAD_2_3000 0.01867281 0.042938904 0.031455519
##  9:      ROAD_1_50 0.01801472 0.007844326 0.003619679
## 10:  ROAD_M345_100 0.01577400 0.036047120 0.052153964
## 11:     ROAD_2_300 0.01335612 0.018889572 0.014784604
## 12:   ROAD_M345_50 0.01236051 0.048190146 0.063420851
## 13:       I_1_5000 0.01200787 0.043426492 0.032475147
## 14:       I_1_3000 0.01162941 0.053990751 0.041040020
## 15:    ROAD_1_3000 0.01159760 0.015211438 0.020086668
##      RMSE       MAE       IQR 
## 11.650457  8.187369  9.264056
#xgboost_imp (variabledf = inde_var, y_varname = "day_value", max_depth = 5, eta = 0.02, nthread = 4, nrounds = 2000, training = training, test = test, grepstring = varstring )

Spatial correlation of the random forest errors

set.seed(2)
pr = globalLUR::sampledf(merged_mr, fraction = 0.8, country2digit = "CN", grepstring_rm = "ID|countryfullname")
inde_var_train = with(pr, inde_var[training, ])
inde_var_test = with(pr, inde_var[test, ])
pre_mat = subset_grep(inde_var_train, grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
rf = ranger(value_mean~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
rf
## Ranger result
## 
## Call:
##  ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000,      importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  2000 
## Sample size:                      1009 
## Number of independent variables:  65 
## Mtry:                             33 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       33.45173 
## R squared (OOB):                  0.7693244
errordf = with(inde_var_test, data.frame(error = predictions(predict(rf, inde_var_test)) - value_mean, LONGITUDE = LONGITUDE, LATITUDE = LATITUDE))
                 
error_sp = errordf %>% st_as_sf(coords = c("LONGITUDE","LATITUDE")) %>% as_Spatial
dt.vgm = variogram(error~1, error_sp, cutoff = 1)
plot(dt.vgm)

LASSO

Lasso(inde_var, vis1 = T, y_varname = "day_value", training = training, test = test)
## [1] 0.2969377
## [1] 1.73917

##      RMSE       MAE       IQR 
## 11.346850  8.095768 12.054876
Lasso(inde_var, vis1 = T, y_varname = "night_value", training = training, test=test)
## [1] 0.2582184
## [1] 0.9498254

##     RMSE      MAE      IQR 
## 7.037871 4.786726 7.799746
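
A minimal sketch of what a Lasso fit involves, using glmnet directly (an approximation of, not identical to, globalLUR::Lasso; alpha = 1 selects the Lasso penalty):

d = na.omit(inde_var[training, ])
x = as.matrix(d %>% dplyr::select(-matches("value_mean|day_|night_")))
y = d$day_value
cvfit = cv.glmnet(x, y, alpha = 1)   # 10-fold CV over the lambda path
coef(cvfit, s = "lambda.min")        # sparse coefficients at the CV-optimal lambda
lasso_pred = predict(cvfit, newx = x, s = "lambda.min")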